Data pre-processing is included, where special chars and minimal stop-words are removed
source("~/Dropbox/Eugenie/scripts/utils.R")
Load additional libraries
## for qdap
library(rJava)
library(qdapRegex)
library(qdapDictionaries)
library(qdapTools)
library(qdap)
## for syuzhet
library(syuzhet)
Get relevant columns
## Columns carried into the text-level sentiment analysis
cols <- c(
  'recid',
  'item_id',
  'user_id',
  'text'
)
## Keep only those columns of the raw reviews, as a plain data frame
reviews2.text <- as.data.frame(reviews2.csv[, cols])
Clean text further with qdap methods
## text cleaning
## Run qdap's check_text() diagnostic on the review text
check_text(reviews2.text$text)
## Apply the qdap cleaners one after another, each on the previous result
reviews2.text$text <- reviews2.text$text %>%
  replace_contraction() %>%
  replace_number() %>%
  add_missing_endmark() %>%
  add_comma_space()
Split each review into sentences. Note that this line took an hour to run on Eugenie’s laptop.
## Split the `text` column of every review into sentences.
## NOTE: this step takes about 1 hour.
reviews2.text.qdap <- sentSplit(
  dataframe = reviews2.text,
  text.var = "text"
)
Compute polarity scores at the sentence level for each review, and transform the result from a list to a data frame.
## Polarity of the split sentences, grouped by review id (recid).
## NOTE: this call alone takes about 3 hours to run.
pol.reviews2.text <- with(
  reviews2.text.qdap,
  polarity(text.var = text, grouping.var = recid)
)
## transform the polarity scores into a flat data frame for analysis
qdap.reviews2.text <- pol.reviews2.text %>%
  scores() %>%
  colsplit2df()
Join the scores with more features from the raw data for further analysis
## Make the join key numeric on both sides.
## Both conversions go through as.character() first: calling as.numeric()
## directly on a factor returns the internal level codes instead of the ids,
## which would silently break the join below.
qdap.reviews2.text$recid <- as.numeric(as.character(qdap.reviews2.text$recid))
reviews2.csv$recid <- as.numeric(as.character(reviews2.csv$recid))
## Attach review-level features from the raw data to the polarity scores
qdap.reviews2 <- merge(
  reviews2.csv[, c('recid', 'rating', 'text', 'incentivized',
                   'is_deleted', 'verified_purchaser')],
  qdap.reviews2.text,
  by = 'recid'
)
The script above computes the sentiment score at the sentence level for each review. We’re not executing it on the fly here because it takes roughly 5 hours in total to generate the result.
Load the pre-computed, joined sentiment score. This is what we would get from executing the script above
## Load the pre-computed sentiment analysis result (qdap package)
qdap.reviews2 <- read.csv(file = '~/Dropbox/Eugenie/data/processed/qdap-reviews2.csv')
## Mean ave.polarity per incentive group, ignoring NA scores
qdap.reviews2 %>%
  group_by(incentivized) %>%
  dplyr::summarise(ave.polarity = mean(ave.polarity, na.rm = TRUE))
## # A tibble: 2 x 2
## incentivized ave.polarity
## <fct> <dbl>
## 1 incentivized 0.277
## 2 non-incentivized 0.406
This result is different from the one we saw using the ‘sentimentr’ package
## Collect the records whose ave.polarity is NA ...
qdap.nas <- qdap.reviews2[is.na(qdap.reviews2$ave.polarity), ]
## ... and render them as a table
knitr::kable(
  qdap.nas,
  caption = 'All reviews with NA ave.polarity score',
  floating.environment = "sidewaystable"
)
| | recid | rating | text | incentivized | is_deleted | verified_purchaser | total.sentences | total.words | ave.polarity | sd.polarity | stan.mean.polarity |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2087 | 16848826 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 4746 | 23150116 | 5 | 100% | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 6594 | 23201012 | 5 | 100 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 8643 | 23996077 | 3 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 21857 | 30801468 | 5 | 10/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 30596 | 51835744 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 42318 | 74763670 | 4 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 44552 | 74770498 | 5 | 11 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 49557 | 81902483 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 51228 | 81911565 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 53309 | 84686504 | 2 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 54465 | 85581501 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 64531 | 94859873 | 5 | | non-incentivized | deleted | unverified | 1 | 0 | NA | NA | NA |
| 70497 | 106271781 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 76132 | 110745574 | 4 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 83608 | 131453099 | 5 | 100% | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 92087 | 164110427 | 5 | 8/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 108521 | 189686285 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 118391 | 249008323 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 124412 | 315873762 | 5 | | non-incentivized | deleted | verified | 1 | 0 | NA | NA | NA |
| 124531 | 317001064 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 124568 | 317001194 | 5 | | non-incentivized | deleted | verified | 1 | 0 | NA | NA | NA |
| 130525 | 353085494 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 139718 | 382134895 | 5 | 10/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 142926 | 385507230 | 5 | 10/10 | non-incentivized | deleted | verified | 1 | 0 | NA | NA | NA |
| 144768 | 391324730 | 4 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 144979 | 391724796 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 156411 | 411667863 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 156934 | 414143097 | 5 | <3 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 164624 | 424141828 | 5 | | non-incentivized | kept | unverified | 1 | 0 | NA | NA | NA |
| 177072 | 434539930 | 5 | 10/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 180298 | 434961431 | 5 | 10/10 | non-incentivized | deleted | verified | 1 | 0 | NA | NA | NA |
| 183032 | 439752882 | 5 | 10/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 183211 | 439753061 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 184384 | 439754748 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 191520 | 452192856 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 191834 | 452193735 | 5 | | non-incentivized | kept | unverified | 1 | 0 | NA | NA | NA |
| 200167 | 462442213 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 201569 | 462445788 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 202813 | 462818420 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 204158 | 463012977 | 2 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 205479 | 464876931 | 4 | 9/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 213132 | 474567469 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 216996 | 475698866 | 5 | 5/5 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 219912 | 479412460 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 221172 | 483235704 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 221330 | 483236894 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 221789 | 483239957 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 227709 | 489373936 | 3 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 227775 | 489374364 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 230355 | 496972834 | 5 | 10/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 231391 | 496991042 | 5 | 10/10 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 232654 | 497936254 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 233054 | 499073153 | 5 | 5/5 | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 234626 | 500156831 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 238727 | 505518777 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 239232 | 505920173 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 239360 | 505921449 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 239750 | 506133439 | 1 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 240504 | 506153997 | 2 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 242307 | 507134614 | 5 | 10/10! | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 242868 | 507137938 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 248267 | 507949725 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 248640 | 507950207 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 253057 | 517250976 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 253708 | 517745389 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 255836 | 521918173 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
| 258233 | 524910051 | 5 | | non-incentivized | kept | verified | 1 | 0 | NA | NA | NA |
There are some NA values for edge cases, i.e. reviews for which qdap counted zero words (`total.words` = 0)
Boxplot: rating vs. ave.polarity
## Warning: Ignoring 68 observations
Join with selected columns from the raw data
## Bring in item_id from the raw data (merge() joins on the shared columns)
qdap.reviews2 <- merge(
  x = qdap.reviews2,
  y = reviews2.csv[, c('recid', 'item_id')]
)
## fixed effect linear model, indexed by item_id
## Use the sentence sentiment score to replace the rating as the response
formula.fe <- ave.polarity ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = qdap.reviews2,
  model = 'within',
  index = 'item_id'
)
# get the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = formula.fe, data = qdap.reviews2, model = "within",
## index = c("item_id"))
##
## Unbalanced Panel: n = 101, T = 29-10133, N = 263948
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -2.762940 -0.300630 -0.058203 0.237060 4.102307
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## incentivizednon-incentivized 0.0746480 0.0113962 6.5502 5.755e-11 ***
## is_deletedkept -0.0282067 0.0037962 -7.4303 1.087e-13 ***
## verified_purchaserverified 0.0397443 0.0038586 10.3002 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 54423
## Residual Sum of Squares: 54379
## R-Squared: 0.00080966
## Adj. R-Squared: 0.00041959
## F-statistic: 71.2656 on 3 and 263844 DF, p-value: < 2.22e-16
## Pearson correlation between the star rating and the qdap polarity score.
## The method is named explicitly: passing all three method names still runs
## only the first one (Pearson), which is what the output reports.
cor.test(
  x = qdap.reviews2$rating,
  y = qdap.reviews2$ave.polarity,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: qdap.reviews2$rating and qdap.reviews2$ave.polarity
## t = 280.03, df = 263946, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4756451 0.4815274
## sample estimates:
## cor
## 0.4785916
The script down below executes a lot faster than the methods from ‘qdap’ package (said by the ‘sentimentr’ description and seen in this particular example as well). It took Eugenie’s laptop around 15 mins in total to execute all four options
## Score every review with each of the four syuzhet::get_sentiment() lexicons.
##
## reviews2.text only holds recid, item_id, user_id and text, so `rating` is
## taken from reviews2.csv. This relies on both frames still having the same
## rows in the same order (reviews2.text was built as a column subset of
## reviews2.csv) -- the row-count check guards the most obvious violation.
stopifnot(nrow(reviews2.text) == nrow(reviews2.csv))
## Inside mutate() the `text` column is referenced directly; the result object
## (syuzhet.reviews2) does not exist yet while this pipeline is evaluated.
syuzhet.reviews2 <- reviews2.text[, c('recid', 'text')] %>%
  mutate(
    rating = reviews2.csv$rating,
    syuzhet.sentiment = syuzhet::get_sentiment(text, method = 'syuzhet'),
    afinn.sentiment = syuzhet::get_sentiment(text, method = 'afinn'),
    nrc.sentiment = syuzhet::get_sentiment(text, method = 'nrc'),
    bing.sentiment = syuzhet::get_sentiment(text, method = 'bing')
  )
## Restore the intended column order: recid, rating, text, then the scores
syuzhet.reviews2 <- syuzhet.reviews2[, c('recid', 'rating', 'text',
                                         'syuzhet.sentiment', 'afinn.sentiment',
                                         'nrc.sentiment', 'bing.sentiment')]
For the sake of time, we’re loading the pre-computed results
## Load the pre-computed syuzhet sentiment scores instead of recomputing them
syuzhet.reviews2 <- read.csv(
  file = '~/Dropbox/Eugenie/data/processed/syuzhet-reviews2.csv'
)
Here we show a brief analysis for all four options.

### Summary stats

Since the ‘syuzhet’ package utilizes four lexicons, we show all four summary stats here
## Mean syuzhet-lexicon score per incentive group, ignoring NA scores
syuzhet.reviews2 %>%
  group_by(incentivized) %>%
  dplyr::summarise(syuzhet.sentiment = mean(syuzhet.sentiment, na.rm = TRUE))
## # A tibble: 2 x 2
## incentivized syuzhet.sentiment
## <fct> <dbl>
## 1 incentivized 6.83
## 2 non-incentivized 1.23
## Mean afinn-lexicon score per incentive group, ignoring NA scores
syuzhet.reviews2 %>%
  group_by(incentivized) %>%
  dplyr::summarise(afinn.sentiment = mean(afinn.sentiment, na.rm = TRUE))
## # A tibble: 2 x 2
## incentivized afinn.sentiment
## <fct> <dbl>
## 1 incentivized 12.5
## 2 non-incentivized 2.78
## Mean nrc-lexicon score per incentive group, ignoring NA scores
syuzhet.reviews2 %>%
  group_by(incentivized) %>%
  dplyr::summarise(nrc.sentiment = mean(nrc.sentiment, na.rm = TRUE))
## # A tibble: 2 x 2
## incentivized nrc.sentiment
## <fct> <dbl>
## 1 incentivized 6.68
## 2 non-incentivized 0.802
## Mean bing-lexicon score per incentive group, ignoring NA scores
syuzhet.reviews2 %>%
  group_by(incentivized) %>%
  dplyr::summarise(bing.sentiment = mean(bing.sentiment, na.rm = TRUE))
## # A tibble: 2 x 2
## incentivized bing.sentiment
## <fct> <dbl>
## 1 incentivized 7.16
## 2 non-incentivized 1.60
All four lexicon options returned the same observation: the incentivized reviews have higher sentiment scores on average than the non-incentivized reviews
This result contradicts what we saw earlier from the ‘qdap’ package, where the non-incentivized reviews had the higher average polarity (0.406 vs. 0.277)
There are no NA values generated by these methods
Boxplot: rating vs. syuzhet.sentiment
Boxplot: rating vs. afinn.sentiment
Boxplot: rating vs. nrc.sentiment
Boxplot: rating vs. bing.sentiment
Join with selected columns from the raw data
## Bring in item_id from the raw data (merge() joins on the shared columns)
syuzhet.reviews2 <- merge(
  x = syuzhet.reviews2,
  y = reviews2.csv[, c('recid', 'item_id')]
)
Here we’re showing the linear model for all four options
## fixed effect linear model, indexed by item_id
## Use the syuzhet sentiment score to replace the rating as the response
formula.fe <- syuzhet.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = 'item_id'
)
# get the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within",
## index = c("item_id"))
##
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -10.05502 -0.77039 -0.19341 0.58073 28.34164
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## incentivizednon-incentivized -4.894152 0.035167 -139.169 < 2.2e-16 ***
## is_deletedkept -0.230489 0.011713 -19.678 < 2.2e-16 ***
## verified_purchaserverified -0.403659 0.011906 -33.904 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 574120
## Residual Sum of Squares: 517970
## R-Squared: 0.097792
## Adj. R-Squared: 0.09744
## F-statistic: 9535.34 on 3 and 263912 DF, p-value: < 2.22e-16
## fixed effect linear model, indexed by item_id
## Use the afinn sentiment score to replace the rating as the response
formula.fe <- afinn.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = 'item_id'
)
# get the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within",
## index = c("item_id"))
##
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -30.112833 -2.315512 -0.037194 1.701766 52.469213
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## incentivizednon-incentivized -8.441802 0.089407 -94.419 < 2.2e-16 ***
## is_deletedkept -0.511688 0.029778 -17.183 < 2.2e-16 ***
## verified_purchaserverified -0.686430 0.030270 -22.677 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 3519300
## Residual Sum of Squares: 3348000
## R-Squared: 0.048664
## Adj. R-Squared: 0.048293
## F-statistic: 4500.04 on 3 and 263912 DF, p-value: < 2.22e-16
## fixed effect linear model, indexed by item_id
## Use the nrc sentiment score to replace the rating as the response
formula.fe <- nrc.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = 'item_id'
)
# get the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within",
## index = c("item_id"))
##
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -11.143271 -0.925852 -0.080661 0.587376 32.544075
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## incentivizednon-incentivized -5.320651 0.040027 -132.928 < 2.2e-16 ***
## is_deletedkept -0.195991 0.013331 -14.701 < 2.2e-16 ***
## verified_purchaserverified -0.356488 0.013551 -26.306 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 733780
## Residual Sum of Squares: 671020
## R-Squared: 0.085524
## Adj. R-Squared: 0.085167
## F-statistic: 8227.26 on 3 and 263912 DF, p-value: < 2.22e-16
## fixed effect linear model, indexed by item_id
## Use the bing sentiment score to replace the rating as the response
formula.fe <- bing.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = 'item_id'
)
# get the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within",
## index = c("item_id"))
##
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -12.40467 -1.01461 -0.15270 0.87808 29.59394
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## incentivizednon-incentivized -4.904944 0.046670 -105.097 < 2.2e-16 ***
## is_deletedkept -0.299480 0.015544 -19.266 < 2.2e-16 ***
## verified_purchaserverified -0.377601 0.015801 -23.898 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 969570
## Residual Sum of Squares: 912270
## R-Squared: 0.0591
## Adj. R-Squared: 0.058733
## F-statistic: 5525.68 on 3 and 263912 DF, p-value: < 2.22e-16
## Pearson correlation between the star rating and the syuzhet-lexicon score.
## The method is named explicitly: passing all three method names still runs
## only the first one (Pearson), which is what the output reports.
cor.test(
  x = syuzhet.reviews2$rating,
  y = syuzhet.reviews2$syuzhet.sentiment,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: syuzhet.reviews2$rating and syuzhet.reviews2$syuzhet.sentiment
## t = 143.17, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2648695 0.2719488
## sample estimates:
## cor
## 0.2684128
## Pearson correlation between the star rating and the afinn-lexicon score.
## The method is named explicitly: passing all three method names still runs
## only the first one (Pearson), which is what the output reports.
cor.test(
  x = syuzhet.reviews2$rating,
  y = syuzhet.reviews2$afinn.sentiment,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: syuzhet.reviews2$rating and syuzhet.reviews2$afinn.sentiment
## t = 183.08, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3322494 0.3390189
## sample estimates:
## cor
## 0.3356385
## Pearson correlation between the star rating and the nrc-lexicon score.
## The method is named explicitly: passing all three method names still runs
## only the first one (Pearson), which is what the output reports.
cor.test(
  x = syuzhet.reviews2$rating,
  y = syuzhet.reviews2$nrc.sentiment,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: syuzhet.reviews2$rating and syuzhet.reviews2$nrc.sentiment
## t = 90.225, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1692463 0.1766471
## sample estimates:
## cor
## 0.1729491
## Pearson correlation between the star rating and the bing-lexicon score.
## The method is named explicitly: passing all three method names still runs
## only the first one (Pearson), which is what the output reports.
cor.test(
  x = syuzhet.reviews2$rating,
  y = syuzhet.reviews2$bing.sentiment,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: syuzhet.reviews2$rating and syuzhet.reviews2$bing.sentiment
## t = 196.88, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3544634 0.3611157
## sample estimates:
## cor
## 0.3577941